library(tidyverse)  # data manipulation and visualization
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
#install.packages("gridExtra")
library(gridExtra)  # plot arrangement
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Read the ratings table; the printed output shows five numeric genre columns
# (Horror, Romcom, Action, Comedy, Fantasy) and 291 rows.
customer_movie_rating <- read.csv(file = "customer_movie_rating.csv")
# Echo the whole table as a quick sanity check of the import.
print(customer_movie_rating)
##     Horror Romcom Action Comedy Fantasy
## 1     72.5   29.9   68.6   40.7    57.9
## 2     82.2   45.3   76.5   17.4    67.7
## 3     70.0   44.0   65.1   53.7    37.8
## 4     99.1   21.0   77.9   25.4    40.3
## 5     84.0    0.0   68.1   49.8    40.0
## 6     70.2   55.0   97.2   48.1    40.5
## 7     85.8   33.0   84.6   31.0    40.6
## 8     88.9   31.5   86.9   36.8    41.5
## 9     86.9   24.8   80.6   17.7    62.3
## 10    76.3   31.1   96.2   33.9    36.6
## 11    98.1   23.0   68.4   42.5    40.3
## 12    84.7   30.0   70.5   31.2    52.8
## 13    72.5   20.2   93.2   24.6    58.5
## 14    53.4    8.6   68.2   22.1    35.7
## 15    93.5   36.9   73.5   29.6    38.9
## 16    79.5   43.2   71.3   42.8    51.3
## 17    79.8   21.3   72.2   24.2    57.2
## 18    91.3   10.0   72.7   28.5    42.0
## 19    89.9   32.7   81.9   14.5    27.8
## 20    87.1   24.5   73.9   35.9    65.5
## 21    91.0    4.2   69.9   45.7    62.2
## 22    89.4   25.0   92.1   48.0    36.5
## 23    80.9   17.4   73.4   39.8    44.2
## 24    56.1   20.9   73.8    7.6    23.9
## 25    87.4   11.1   74.8   35.8    51.8
## 26    79.3   46.6   84.6   35.5    64.3
## 27    78.1   21.0   75.1   25.8    25.4
## 28    62.4    5.7   75.5   32.0    35.6
## 29    74.3   27.4   67.8   19.6    37.3
## 30    85.0   28.2   72.1   38.2    36.8
## 31    96.3   13.2   76.7   26.1    20.6
## 32    78.8    0.0   68.9   11.2    51.0
## 33    84.7   17.3   82.4   25.6    26.6
## 34    79.4   31.8   57.8   46.4    44.7
## 35    63.5   24.3   79.7   26.0    52.1
## 36    75.0   23.8   57.6   38.8    42.6
## 37    75.3   31.7   72.4   41.4    55.7
## 38    79.3   10.8   69.7   30.1    44.7
## 39    93.2   38.2   68.2   25.8    37.2
## 40    89.2   24.9   75.3   23.6    52.8
## 41    78.0   33.5   53.0   38.9    39.8
## 42    77.0   37.4   90.1   17.2    66.3
## 43    88.4   27.7   56.0   33.0    44.8
## 44    86.7   14.5   70.4   26.5    55.2
## 45    71.7   39.0   62.6    2.8    47.5
## 46    71.5    1.0   67.0   13.1     8.9
## 47    84.4   18.5  100.0   41.0    28.6
## 48    89.2   21.9   76.2   27.7    39.9
## 49    78.7   23.0   60.6   39.6    47.8
## 50    90.6   37.2   56.3   52.6    16.9
## 51    84.8   26.6   81.4   47.7    56.5
## 52    72.7   29.9   75.8   38.1    37.7
## 53    84.1   24.2   72.2   34.6    36.0
## 54    66.4   22.0   64.8   27.7    26.3
## 55    97.2   33.3   58.2   48.9    27.6
## 56   100.0   38.8   63.1   37.2    45.7
## 57    75.6    0.0   88.0   15.9    51.1
## 58    67.5   31.9   68.5   28.1    19.8
## 59    86.8   29.5   59.4    7.0    32.9
## 60    78.4   19.9   98.4   27.7    51.4
## 61   100.0   36.4   81.1    0.0    39.6
## 62    79.5   20.3   73.1   45.8    71.0
## 63    88.3   21.6   88.7   22.4    59.9
## 64    80.3   35.3   86.6   24.8    52.1
## 65    71.1   45.6   68.6   28.0    45.1
## 66    82.3   28.2  100.0   37.3    48.4
## 67    58.3   19.9   72.9   38.1    36.5
## 68    97.6   10.7   58.9   36.8    52.5
## 69    81.8   21.0   74.3   23.1    62.8
## 70   100.0   13.7   78.5   13.6    58.0
## 71    85.7   21.9  100.0   25.3    35.2
## 72    71.5   29.7   77.3   33.3    25.6
## 73    87.3   14.8   81.5   20.1    43.7
## 74    68.8   56.8   75.1   29.2    50.3
## 75    65.0   26.9   72.0   16.0    61.2
## 76    83.5   38.6   75.6   29.9    29.2
## 77    74.7    0.0   85.5   31.5    49.4
## 78    80.0   33.9  100.0   28.2    47.8
## 79    80.9    9.2   88.3   28.0    59.3
## 80    72.9   36.0   90.5   51.2    44.7
## 81    73.2   29.8   61.2   39.2    40.7
## 82    78.4   20.1   87.8   43.3    31.2
## 83    94.1   40.9   78.6   18.9    38.8
## 84    61.7   16.6   58.4   32.0    40.7
## 85    87.1   18.0   82.3   43.9    73.2
## 86    84.0   13.0   74.1   29.3    74.4
## 87    92.8   17.0   93.6    4.4    43.0
## 88    76.3   36.3   66.8   34.1    32.5
## 89    84.4   30.2   70.8    7.1    21.3
## 90    83.2   37.1   64.9   20.3    51.2
## 91    73.5   20.3   73.9   45.9    31.9
## 92    94.5   29.5   80.8   37.4    72.4
## 93    93.9   27.9   67.2   43.1    34.4
## 94    88.4    7.9   86.0   33.7    46.3
## 95    99.0   46.3   61.5   28.7    90.7
## 96    86.7   26.6   63.4   18.9    31.7
## 97    64.7   34.2   93.3   49.1    48.7
## 98    73.1   36.5   63.8   30.5    31.7
## 99    65.3   24.4   80.9   21.4    49.2
## 100   74.3   21.3   71.4   40.4    34.5
## 101   25.9   25.8   38.1   93.8    65.9
## 102   21.4   50.9   60.2   90.6    88.8
## 103   10.8   63.6   59.7   72.1    64.9
## 104   25.1   47.1   44.4   60.3    79.7
## 105   36.9   54.6   60.2   71.1    85.7
## 106   44.1   35.9   61.8   46.1    59.0
## 107    8.5   65.2   56.9   71.2    79.8
## 108   22.0   32.8   74.3   94.9    73.7
## 109   38.9   51.1   26.5   76.6    83.1
## 110   11.6   51.9   36.0   88.2    65.5
## 111    0.0   55.7   33.5   80.9    74.0
## 112   13.8   45.1   52.0   65.7    91.6
## 113   13.4   77.0   69.0   95.9    77.0
## 114   25.6   37.8   70.1   74.1    84.9
## 115   20.2   58.0   55.9   63.3    72.3
## 116   27.8   19.7   60.5   75.8    62.6
## 117   19.9   55.0   48.3   56.8    74.9
## 118   29.5   61.1   55.6   85.4    60.3
## 119   20.6   42.0   54.5   81.0    43.8
## 120   39.3   63.5   40.9   70.7    89.0
## 121   16.2   59.0   46.5   69.1    62.0
## 122   28.5   66.7   48.4   86.2    53.1
## 123   14.4   44.9   66.7   62.3    86.9
## 124   27.5   43.4   37.6   63.2    74.9
## 125   24.4   33.7   24.6   80.1    67.8
## 126    4.8   51.1   59.2   69.6    72.9
## 127   23.3   38.9   40.2   86.1    69.9
## 128   39.2   63.3   44.8   72.6    87.0
## 129   33.2   55.9   60.9   89.3    83.7
## 130   26.7   81.3   40.8   80.9    54.3
## 131   10.7   41.1   45.9   48.1    79.2
## 132   39.0   69.2   68.0   59.0    83.7
## 133   26.0   48.7   56.3   90.4    83.0
## 134   19.6   37.5   56.5   83.3    45.9
## 135   44.7   61.9   48.4   63.4    72.2
## 136   15.8   37.8   36.4   58.9    98.8
## 137   28.6   42.3   32.0   87.4    84.6
## 138   40.4   46.2   47.3   65.3    54.5
## 139   32.2   57.5   74.0   96.6    55.0
## 140   21.3   43.0   52.7   96.3    80.9
## 141   20.0   67.9   52.0   57.5    72.9
## 142   29.3   40.6   54.0   64.9    86.5
## 143   31.2   57.6   45.4   60.0    78.5
## 144   25.2   56.7   33.2   83.0    76.0
## 145   40.8   42.2   82.1   59.5    77.2
## 146   24.2   49.9   44.9   50.6    77.0
## 147   16.6   47.1   46.4   99.3    59.8
## 148   31.4   66.5   28.5   87.1   100.0
## 149    0.0   73.7   47.0   84.8    58.1
## 150   29.7   42.5   47.0   67.0    74.8
## 151   31.0   66.2   46.9   74.9    68.5
## 152   22.3   54.1   28.6   82.4    96.6
## 153   11.6   31.4   71.4   59.6    87.1
## 154   20.3   45.9   71.2   73.5    68.2
## 155   43.6   60.5   58.3   77.1    77.5
## 156   16.1   56.7   36.8   95.3    89.0
## 157    0.0   25.7   58.6   82.7   100.0
## 158   34.7   62.0   47.0   90.4    78.6
## 159   19.0   62.9   46.2   76.7    62.5
## 160   18.9   51.3   66.4   61.6    63.2
## 161   10.4   46.5   35.3   70.9    99.1
## 162   24.7   78.7   43.9   55.0    50.2
## 163   33.4   53.9   41.2   86.1   100.0
## 164   18.0   54.8   50.2   92.0    71.9
## 165   17.7   41.5   31.1   74.2    69.5
## 166   38.2   38.9   41.6   63.2    76.9
## 167   22.0   36.7   58.6   88.0    86.2
## 168   23.1   49.9   55.6   76.7    78.6
## 169   17.5   71.3   38.3   70.4    51.5
## 170   35.8   76.0   56.7   88.5    79.2
## 171   65.4   46.4   80.5   45.9    23.1
## 172   67.9   33.5   85.5   80.7    20.1
## 173   47.6   20.9   63.5   67.4    27.1
## 174   31.5   24.9   66.5   60.9    22.9
## 175   56.1   23.5   39.0   80.2    25.1
## 176   48.7    0.0   63.5   76.3    33.9
## 177   50.8   45.4   79.6   80.3    16.0
## 178   48.6   37.8   81.1   61.5    39.1
## 179   55.2   15.8   99.3   74.3    30.0
## 180   56.3   29.6   75.8   68.5    29.5
## 181   69.6   27.9   80.6   58.8     5.6
## 182   71.8   11.4   75.9   86.1    26.4
## 183   50.5   42.9   69.2   66.9    23.3
## 184   56.3   22.0   84.0   64.3    40.2
## 185   64.3   27.2   80.6   71.3    46.7
## 186   76.8   29.9   76.6   69.7    44.8
## 187   59.3   13.1   65.2   56.5    16.6
## 188   39.6   12.0   74.5   64.4    20.4
## 189   62.8   24.4   84.4   96.3    28.9
## 190   58.6   31.9   83.1   64.1    17.3
## 191   81.3   25.9   69.2   61.4    10.7
## 192   64.1   33.5   67.0   73.8    39.5
## 193   52.5   29.0   81.2   69.8    30.8
## 194   54.7   31.5   87.6   58.5    37.4
## 195   53.9    8.2   76.5   52.0    15.6
## 196   62.2   33.1   71.2   75.2    25.9
## 197   62.1   15.5   64.4   66.4    15.9
## 198   71.0   19.4   70.0   70.8    20.0
## 199   63.8   23.7   57.2   61.5    32.8
## 200   55.6    5.2   66.6   45.2    20.2
## 201   48.7   23.8   88.6   65.8    18.1
## 202   67.6   19.7   88.3   93.7    14.3
## 203   59.3   16.4   64.6   47.5    12.8
## 204   62.2   18.3   77.5   53.2    19.6
## 205   73.2   39.9   75.8   49.0    50.4
## 206   81.0    9.9   55.0   60.6    22.1
## 207   48.6   22.4   84.7   81.6    16.6
## 208   79.7    0.0   52.1   63.2    13.1
## 209   49.6   16.9   60.0   61.9    28.2
## 210   63.2   19.0   87.0   49.5    25.3
## 211   62.7   43.5   68.5   65.8    33.0
## 212   56.7   13.5   72.4   77.4    26.6
## 213   76.7   14.5   55.5   92.1    40.4
## 214   52.1    8.2   57.6   80.8    38.6
## 215   67.9   27.2   79.2   54.6    42.7
## 216   59.8   38.8   72.9   59.0    32.9
## 217   48.8   10.6   67.9   72.3    43.4
## 218   74.6   19.9   59.0   64.9    29.1
## 219   34.9   41.4   61.8   68.3    41.2
## 220   53.7   16.8   99.4   49.3    28.1
## 221   41.5   33.2   71.1   59.9    21.8
## 222   62.3   29.7   84.3   53.9    15.4
## 223   63.2    9.3   84.4   53.9    21.5
## 224   46.6   39.6   84.2   56.4    21.0
## 225   67.8   34.5   78.5   60.0    36.9
## 226   47.6   19.1   60.0   64.9    29.4
## 227   67.9   14.2   62.9   72.6    22.4
## 228   62.9   20.3   84.0   58.9    19.2
## 229   68.6   34.8   59.3   54.7    48.2
## 230   48.7   18.2   81.3   84.4    27.2
## 231   61.1    2.5   68.6   76.9    15.0
## 232   54.4   23.3   70.2   73.4    24.8
## 233   42.4   34.3   65.5   85.4    31.1
## 234   61.8   11.1   72.2   53.3    40.0
## 235   81.3   22.1   85.5   89.6    29.0
## 236   52.2   10.3   80.4   79.7    31.3
## 237   57.6   26.4   72.2   68.7    17.5
## 238   68.3   23.4   85.4   72.5    39.6
## 239   60.4   24.6   94.9   65.7    31.2
## 240   83.3   17.5   74.9   63.7    38.8
## 241   68.8   16.5   80.6   46.1    48.6
## 242   87.9   35.5   78.3   73.9    32.1
## 243   64.2   26.3   63.3   90.7    26.6
## 244   46.4   22.8   63.9   90.1    20.8
## 245   65.1    0.0   98.0   67.0    23.1
## 246   48.9    9.7   85.6   63.7    19.0
## 247   47.9   34.2   83.9   67.2    34.4
## 248   57.7   20.1   76.8   78.8    12.4
## 249   71.2   10.5   80.8   82.7     9.1
## 250   64.1   19.7   76.8   69.9    22.0
## 251   69.8   20.5   75.5   78.4    43.0
## 252   71.0   19.0   77.7   64.7    31.5
## 253   57.9   31.0   62.9   71.0    24.1
## 254   31.2   43.3  100.0   79.2    38.0
## 255   69.6   36.9   84.6  100.0    25.8
## 256   86.0   40.0   72.0   64.4    28.6
## 257   60.7   21.0   89.6   57.0    30.9
## 258   43.7   35.1   67.5   70.3    43.1
## 259   55.6   13.2   95.5   60.1    45.5
## 260   48.8   23.3   70.3   72.6    45.4
## 261   59.5   51.2   47.1   69.1    41.7
## 262   68.1   24.8   91.4   80.8    34.0
## 263   70.4   21.3   88.6   53.5    52.6
## 264   62.8   18.0   65.7   50.5    54.5
## 265   48.8   34.3   58.1   83.8    45.9
## 266   69.8   50.3   53.0   67.7    27.4
## 267   76.2   29.9   71.8   53.9    17.5
## 268   87.0   21.9   53.0   52.1    32.3
## 269   54.1   49.9   65.2   58.4    23.3
## 270   65.7   15.7   77.0   72.1    33.3
## 271   74.3   38.6   85.3   58.7    23.4
## 272   58.6   19.9   65.2   81.1    41.1
## 273   66.3   12.7   73.5   67.9    56.3
## 274   62.6   39.6   78.1   67.4    41.6
## 275   58.4    3.4   95.6   64.8    38.9
## 276   62.0   21.3   63.5   61.4    30.4
## 277   71.6   25.2   55.7   63.0    56.2
## 278   64.9   19.7   52.9   78.9    29.0
## 279   54.4    5.3   81.7   48.9    46.5
## 280   33.1   17.3   74.3   78.0    48.9
## 281   50.5    6.3   84.3   60.8    31.9
## 282   59.8   48.1   73.3   73.8     0.0
## 283   29.8    2.7   79.7   64.3    26.0
## 284   86.5    0.0   77.7   48.9    35.5
## 285   42.1   33.4   75.3   69.2    14.4
## 286   46.1   35.9   67.5   57.3    14.5
## 287   77.5   22.6   90.1   78.5    40.9
## 288   33.7   22.5   70.1   55.1     0.0
## 289   68.9   33.7   83.0   43.5    32.8
## 290   55.9   41.8   77.0   54.8    26.8
## 291   36.8   51.3   51.1   54.9    52.5
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Drop rows containing missing values, then standardize every genre column
# (scale() centres each column and divides it by its standard deviation).
# The result is a numeric matrix, which the clustering calls below consume.
df <- scale(na.omit(customer_movie_rating))
head(df)
##      Horror     Romcom      Action     Comedy    Fantasy
## 1 0.5716500 -0.0791958 -0.01515773 -0.6986964  0.5587483
## 2 0.9696365  0.8275698  0.48122544 -1.7277543  1.0042660
## 3 0.4690762  0.7510247 -0.23507432 -0.1245439 -0.3550175
## 4 1.6630357 -0.6032357  0.56919208 -1.3744297 -0.2413650
## 5 1.0434897 -1.8397343 -0.04657438 -0.2967896 -0.2550033
## 6 0.4772821  1.3987144  1.78187502 -0.3718711 -0.2322728
# Pairwise distances between the standardized rating rows
# (get_dist() is called with its default distance method).
distance <- get_dist(df)

# Heat map of the distance matrix. The interactive `?get_dist` help lookup was
# removed: it only opens a help page and has no place in a reproducible script.
fviz_dist(distance)

# k-means with 25 random starts.
# NOTE: despite its name, `k2` is fitted with three centres.
k2 <- kmeans(x = df, centers = 3, nstart = 25)
# Inspect the components of the fitted object.
str(k2)
## List of 9
##  $ cluster     : Named int [1:291] 3 3 1 3 3 3 3 3 3 3 ...
##   ..- attr(*, "names")= chr [1:291] "1" "2" "3" "4" ...
##  $ centers     : num [1:3, 1:5] 0.0441 -1.3916 0.9064 -0.4249 1.2101 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:5] "Horror" "Romcom" "Action" "Comedy" ...
##  $ totss       : num 1450
##  $ withinss    : num [1:3] 206 156 191
##  $ tot.withinss: num 554
##  $ betweenss   : num 896
##  $ size        : int [1:3] 114 72 105
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
library(factoextra)

# Visualize the cluster assignment stored in `k2` on the standardized data.
# (The interactive `?kmeans` help lookup was removed from the script.)
fviz_cluster(k2, data = df)

# Draw one scatter plot per pair of genres, with every observation shown as
# its row label and coloured by the cluster assigned in `fit`.
#
# scaled: numeric matrix of standardized ratings, one named column per genre.
# fit:    kmeans() result obtained from `scaled`.
# Returns (invisibly) the list of ggplot objects, in combn() order, after
# printing each one.
plot_cluster_pairs <- function(scaled, fit) {
  # Take the labels from the clustered matrix itself so they stay aligned with
  # the cluster vector even when na.omit() removed rows from the raw table.
  row_labels <- rownames(scaled)
  if (is.null(row_labels)) {
    row_labels <- as.character(seq_len(nrow(scaled)))
  }

  plot_data <- scaled %>%
    as_tibble() %>%
    mutate(cluster = fit$cluster,
           customer = row_labels)

  # All unordered genre pairs: (Horror, Romcom), (Horror, Action), ...
  genre_pairs <- combn(colnames(scaled), 2, simplify = FALSE)

  plots <- map(genre_pairs, function(pair) {
    ggplot(plot_data,
           aes(.data[[pair[1]]], .data[[pair[2]]],
               color = factor(cluster), label = customer)) +
      geom_text() +
      labs(x = pair[1], y = pair[2])
  })

  # ggplot objects are not auto-printed inside functions, so print explicitly.
  walk(plots, print)
  invisible(plots)
}

# Pairwise genre plots for the clustering stored in `k2` (three centres)
plot_cluster_pairs(df, k2)

# Fits for k = 2..5 used in the side-by-side comparison.
# `k2` is refitted with two centres here: the panel titled "k = 2" previously
# displayed the three-centre fit. The seed makes the fits reproducible.
set.seed(123)
k2 <- kmeans(df, centers = 2, nstart = 25)
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")

library(gridExtra)

grid.arrange(p1, p2, p3, p4, nrow = 2)

# Pairwise genre plots for the three-cluster solution
plot_cluster_pairs(df, k3)

## Ideal Cluster Numbers

set.seed(123)

# Total within-cluster sum of squares of a k-means fit with `k` centres.
#
# k:      number of clusters to fit.
# data:   numeric matrix or data frame to cluster; defaults to the global `df`,
#         which is looked up when the function is called.
# nstart: number of random starts handed to kmeans().
# Returns the `tot.withinss` component of the fit (a single number).
wss <- function(k, data = df, nstart = 10) {
  kmeans(data, centers = k, nstart = nstart)$tot.withinss
}
# Evaluate the total within-cluster sum of squares for k = 1, ..., 15
k.values <- 1:15
wss_values <- vapply(k.values, wss, numeric(1))

# Elbow plot: within-cluster sum of squares against the number of clusters
plot(x = k.values, y = wss_values,
     type = "b", pch = 19, frame.plot = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")

set.seed(123)

# Elbow method: within-cluster sum of squares per candidate number of clusters
fviz_nbclust(x = df, FUNcluster = kmeans, method = "wss")

# Average silhouette method


fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")

#gap statistic

library(cluster)
# Compute the gap statistic for k = 1..10 using 50 simulated reference sets.
# The clustering function is passed under its full argument name,
# `FUNcluster`; the previous `FUN =` only worked through partial matching.
# `nstart = 25` is forwarded to kmeans().
set.seed(123)
gap_stat <- clusGap(df, FUNcluster = kmeans, nstart = 25,
                    K.max = 10, B = 50)
# Print the result, choosing k by the first local maximum of the gap
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 3
##           logW   E.logW       gap     SE.sim
##  [1,] 5.354803 5.723527 0.3687242 0.01293288
##  [2,] 5.058695 5.541902 0.4832072 0.01242534
##  [3,] 4.899418 5.461444 0.5620263 0.01243341
##  [4,] 4.853367 5.396459 0.5430921 0.01258290
##  [5,] 4.809464 5.346107 0.5366437 0.01169743
##  [6,] 4.775626 5.302812 0.5271867 0.01142782
##  [7,] 4.741856 5.264109 0.5222532 0.01158757
##  [8,] 4.708943 5.228317 0.5193740 0.01165380
##  [9,] 4.676318 5.196718 0.5204000 0.01188042
## [10,] 4.649985 5.167409 0.5174246 0.01257006
# Plot the gap statistic computed above against the number of clusters
fviz_gap_stat(gap_stat)

# Final model: three clusters, best of 25 random starts, with a fixed seed so
# the cluster numbering is reproducible.
set.seed(123)
final <- kmeans(x = df, centers = 3, nstart = 25)
print(final)
## K-means clustering with 3 clusters of sizes 114, 72, 105
## 
## Cluster means:
##        Horror     Romcom     Action     Comedy     Fantasy
## 1  0.04409655 -0.4249402  0.3456929  0.5053192 -0.78478335
## 2 -1.39161333  1.2100503 -1.1767013  0.8158762  1.29578460
## 3  0.90637289 -0.3683851  0.4315572 -1.1080902 -0.03648752
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   3   1   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   3   3   3   3   3   3   3   3   3   1   3   3   3   3   3   3   3   3   3   3 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   2   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 
##   1   1   1   1   3   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 
##   3   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 
##   2   1   3   3   1   1   1   3   1   1   1   1   1   1   1   1   1   1   1   1 
## 281 282 283 284 285 286 287 288 289 290 291 
##   1   1   1   3   1   1   1   1   3   1   2 
## 
## Within cluster sum of squares by cluster:
## [1] 205.8360 156.4432 191.4703
##  (between_SS / total_SS =  61.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Mean raw rating per genre within each k-means cluster.
# summarise(across()) replaces the superseded summarise_all(); the grouping
# column is excluded from everything() automatically.
customer_movie_rating %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean))
## # A tibble: 3 x 6
##   Cluster Horror Romcom Action Comedy Fantasy
##     <int>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
## 1       1   59.6   24.0   74.3   68.0    28.3
## 2       2   24.6   51.8   50.1   75.0    74.1
## 3       3   80.7   25.0   75.7   31.4    44.8

#Hierarchical

library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dendextend) # for comparing two dendrograms
## 
## ---------------------
## Welcome to dendextend version 1.15.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Euclidean distances between all pairs of rows of the standardized matrix
d <- dist(x = df, method = "euclidean")

# Agglomerative clustering with complete linkage, followed by its dendrogram
hc1 <- hclust(d = d, method = "complete")
plot(hc1, cex = 0.6, hang = -1)

# Same linkage through agnes(), which also reports an agglomerative coefficient
hc2 <- agnes(x = df, method = "complete")

# Agglomerative coefficient of the complete-linkage tree
hc2[["ac"]]
## [1] 0.8962888
# Linkage methods to compare; the vector is named by its own values so the
# result below is labelled by method.
m <- setNames(nm = c("average", "single", "complete", "ward"))

# Agglomerative coefficient of agnes() run on `df` with linkage method `x`
ac <- function(x) {
  fit <- agnes(df, method = x)
  fit$ac
}

vapply(m, ac, numeric(1))
##   average    single  complete      ward 
## 0.8210505 0.5923947 0.8962888 0.9794408
# Ward linkage had the highest agglomerative coefficient in the comparison
# above, so fit it and draw its dendrogram.
hc3 <- agnes(x = df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

## DIANA

# Divisive hierarchical clustering of the standardized ratings
hc4 <- diana(x = df)

# Divisive coefficient; amount of clustering structure found
hc4[["dc"]]
## [1] 0.8831184
## [1] 0.8514345

# Plot the dendrogram of the divisive (diana) clustering stored in `hc4`
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")

## Working with the result

# Ward's method on the Euclidean distance matrix `d`
hc5 <- hclust(d, method = "ward.D2")

# Cut the tree into 6 groups. The earlier k = 4 cut was overwritten straight
# away and never used, so only the effective cut is kept; the interactive
# `?cutree` help lookup was removed as well.
sub_grp <- cutree(hc5, k = 6)

# Number of members in each cluster
table(sub_grp)
## sub_grp
##  1  2  3  4  5  6 
## 43 38 63 67 49 31
# Preview the raw ratings with the hierarchical cluster label attached
head(mutate(customer_movie_rating, cluster = sub_grp))
##   Horror Romcom Action Comedy Fantasy cluster
## 1   72.5   29.9   68.6   40.7    57.9       1
## 2   82.2   45.3   76.5   17.4    67.7       1
## 3   70.0   44.0   65.1   53.7    37.8       2
## 4   99.1   21.0   77.9   25.4    40.3       3
## 5   84.0    0.0   68.1   49.8    40.0       3
## 6   70.2   55.0   97.2   48.1    40.5       2
# Dendrogram of the Ward tree with a rectangle around each of the six groups.
# NOTE(review): only four border colours (2:5) are supplied for six
# rectangles, so colours presumably repeat — confirm this is intended.
plot(x = hc5, cex = 0.6)
rect.hclust(tree = hc5, k = 6, border = 2:5)

# Hierarchical clustering result: labels in `sub_grp` come from cutting the
# Ward tree `hc5`, not from the DIANA fit.

# NOTE(review): because of head(), `d2` keeps only the first rows of the
# labelled table (six by default) — confirm whether the full table was intended.
d2<- customer_movie_rating %>%
  mutate(cluster = sub_grp) %>%
  head

# Mean raw rating per genre within each hierarchical cluster.
# summarise(across()) replaces the superseded summarise_all(); the grouping
# column is excluded from everything() automatically.
customer_movie_rating %>%
  mutate(cluster = sub_grp) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))
## # A tibble: 6 x 6
##   cluster Horror Romcom Action Comedy Fantasy
##     <int>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
## 1       1   81.3   26.5   82.6   30.3    54.4
## 2       2   50.9   39.3   72.2   64.0    32.7
## 3       3   81.4   24.3   69.0   33.7    36.3
## 4       4   23.7   52.0   50.2   75.8    75.9
## 5       5   61.9   18.2   83.2   67.8    30.3
## 6       6   60.3   17.9   63.0   70.6    26.9

#K means result

# Mean raw rating per genre within each k-means cluster (same summary as
# printed right after fitting `final`), using summarise(across()) instead of
# the superseded summarise_all().
customer_movie_rating %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean))
## # A tibble: 3 x 6
##   Cluster Horror Romcom Action Comedy Fantasy
##     <int>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
## 1       1   59.6   24.0   74.3   68.0    28.3
## 2       2   24.6   51.8   50.1   75.0    74.1
## 3       3   80.7   25.0   75.7   31.4    44.8

Berdasarkan clustering yang dilakukan di atas, dapat disimpulkan:

  1. Jumlah cluster yang optimal adalah 3 cluster (k = 3).

  2. Untuk Cluster 1, rating movie tertinggi terdapat pada genre Action dan yang terendah adalah genre Romcom.

  3. Untuk Cluster 2, rating movie tertinggi terdapat pada genre Comedy dan yang terendah adalah genre Horror.

  4. Untuk Cluster 3, rating movie tertinggi terdapat pada genre Horror dan yang terendah adalah genre Romcom.

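  5. Secara keseluruhan, genre Action memperoleh rating rata-rata paling tinggi sedangkan genre Romcom memperoleh rating paling rendah; genre Horror hanya menjadi yang tertinggi pada Cluster 3, dan pada Cluster 2 justru Horror yang memperoleh rating paling rendah.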